{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import joblib\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from xgboost import XGBClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 25.4 s, sys: 5.34 s, total: 30.7 s\n",
      "Wall time: 30.7 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# 导入训练数据集\n",
    "data_date = pd.read_csv('../../preprocess_data/train_x_date.csv').drop(columns=['id','loan_hour'])\n",
    "data_raw = joblib.load('../../preprocess_data_new/train_ax_nodup.lz4').head(33465).drop(columns=['loan_dt','id','tag'])\n",
    "data_null = pd.read_csv('../../preprocess_data_new/train_ax_row_null.csv',nrows=33465).drop(columns=['id'])\n",
    "data_tag = pd.read_csv('../../preprocess_data/train_x_33465.csv',usecols=['tag'])\n",
    "week_df = joblib.load('../../preprocess_data_discrete/week_df.lz4').head(33465)\n",
    "data = pd.concat([data_date,data_raw,data_null,data_tag,week_df],axis=1,copy=False)\n",
    "data_label = pd.read_csv('../../preprocess_data/train_y_33465.csv',usecols=['label'])\n",
    "x = data.fillna(-1).values\n",
    "y = data_label.values.ravel()\n",
    "x.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 5.91 s, sys: 592 ms, total: 6.5 s\n",
      "Wall time: 10 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['train_data.lz4']"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "joblib.dump(data,'train_data.lz4',compress='lz4')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### num_boost_round"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "param_test1 = {'n_estimators':range(150,300,10)}\n",
    "xgb = XGBClassifier(max_depth=5,\n",
    "                      learning_rate =0.05, \n",
    "                      booster='gbtree',\n",
    "                      objective='binary:logistic',\n",
    "                      early_stopping_rounds=100,\n",
    "                      scale_pos_weight=float(len(y)-np.sum(y))/float(np.sum(y)),\n",
    "                      eval_metric='auc',\n",
    "                      gamma=1,\n",
    "                      reg_lambda=1,\n",
    "                      subsample=0.9,\n",
    "                      min_child_weight=1,\n",
    "                      seed=2018,\n",
    "                      silent=False,\n",
    "                      n_jobs=24\n",
    "                             )\n",
    "gsearch1 = GridSearchCV(estimator = xgb, param_grid = param_test1, scoring='roc_auc',cv=5,n_jobs=8)\n",
    "gsearch1.fit(x,y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import joblib\n",
    "joblib.dump(gsearch1,'./xgb/gsearch1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "({'n_estimators': 220}, 0.8262723491029047)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gsearch1.best_params_,gsearch1.best_score_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### max_depth"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "param_test2 = {'max_depth':range(3,8,1)}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 29min 42s, sys: 10 s, total: 29min 52s\n",
      "Wall time: 41min 7s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "xgb = XGBClassifier(n_estimators = 220,\n",
    "                      learning_rate =0.05, \n",
    "                      booster='gbtree',\n",
    "                      objective='binary:logistic',\n",
    "                      early_stopping_rounds=100,\n",
    "                      scale_pos_weight=float(len(y)-np.sum(y))/float(np.sum(y)),\n",
    "                      eval_metric='auc',\n",
    "                      gamma=1,\n",
    "                      reg_lambda=1,\n",
    "                      subsample=0.9,\n",
    "                      min_child_weight=1,\n",
    "                      seed=2018,\n",
    "                      silent=False,\n",
    "                      n_jobs=24\n",
    "                             )\n",
    "gsearch2 = GridSearchCV(estimator = xgb, param_grid = param_test2, scoring='roc_auc',cv=5,n_jobs=8)\n",
    "gsearch2.fit(x,y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['./xgb/gsearch2']"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import joblib\n",
    "joblib.dump(gsearch2,'./xgb/gsearch2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "({'max_depth': 3}, 0.8313512310612271)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gsearch2.best_params_,gsearch2.best_score_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### min_child_weight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=5, error_score='raise-deprecating',\n",
       "       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
       "       colsample_bytree=1, early_stopping_rounds=100, eval_metric='auc',\n",
       "       gamma=1, learning_rate=0.05, max_delta_step=0, max_depth=3,\n",
       "       min_child_weight=1, missing=None, n_estimators=220, n_jobs=24,\n",
       "       nthread=None, objective='binary:logistic', random_state=0,\n",
       "       reg_alpha=0, reg_lambda=1, scale_pos_weight=14.22520473157416,\n",
       "       seed=2018, silent=False, subsample=0.9),\n",
       "       fit_params=None, iid='warn', n_jobs=8,\n",
       "       param_grid={'min_child_weight': range(1, 6)},\n",
       "       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n",
       "       scoring='roc_auc', verbose=0)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "param_test3 = {'min_child_weight':range(1,6,1)}\n",
    "xgb = XGBClassifier(n_estimators = 220,\n",
    "                      max_depth = 3,\n",
    "                      learning_rate =0.05, \n",
    "                      booster='gbtree',\n",
    "                      objective='binary:logistic',\n",
    "                      early_stopping_rounds=100,\n",
    "                      scale_pos_weight=float(len(y)-np.sum(y))/float(np.sum(y)),\n",
    "                      eval_metric='auc',\n",
    "                      gamma=1,\n",
    "                      reg_lambda=1,\n",
    "                      subsample=0.9,\n",
    "                      seed=2018,\n",
    "                      silent=False,\n",
    "                      n_jobs=24\n",
    "                             )\n",
    "gsearch3 = GridSearchCV(estimator = xgb, param_grid = param_test3, scoring='roc_auc',cv=5,n_jobs=8)\n",
    "gsearch3.fit(x,y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['./xgb/gsearch3']"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import joblib\n",
    "joblib.dump(gsearch3,'./xgb/gsearch3')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "({'min_child_weight': 2}, 0.8314357495296593)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gsearch3.best_params_,gsearch3.best_score_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### subsample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 9h 53min 49s, sys: 8min 26s, total: 10h 2min 15s\n",
      "Wall time: 31min 31s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "param_test4 = {\n",
    " 'subsample':[i/10.0 for i in range(6,10,1)],\n",
    "#  'colsample_bytree':[i/10.0 for i in range(6,10,1)]\n",
    "}\n",
    "xgb = XGBClassifier(n_estimators = 220,\n",
    "                      max_depth = 3,\n",
    "                      min_child_weight = 2,\n",
    "                      learning_rate =0.05, \n",
    "                      booster='gbtree',\n",
    "                      objective='binary:logistic',\n",
    "                      early_stopping_rounds=100,\n",
    "                      scale_pos_weight=float(len(y)-np.sum(y))/float(np.sum(y)),\n",
    "                      eval_metric='auc',\n",
    "                      gamma=1,\n",
    "                      reg_lambda=1,\n",
    "                      seed=2018,\n",
    "                      silent=False,\n",
    "                      n_jobs=24\n",
    "                             )\n",
    "gsearch4 = GridSearchCV(estimator = xgb, param_grid = param_test4, scoring='roc_auc',cv=5,n_jobs=1,pre_dispatch=8)\n",
    "gsearch4.fit(x,y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['./xgb/gsearch4']"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import joblib\n",
    "joblib.dump(gsearch4,'./xgb/gsearch4')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "({'subsample': 0.9}, 0.8314357495296593)"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gsearch4.best_params_,gsearch4.best_score_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 综合"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "xgb = XGBClassifier(n_estimators = 220,\n",
    "                      max_depth = 3,\n",
    "                      min_child_weight = 2,\n",
    "                      learning_rate =0.05, \n",
    "                      subsample = 0.9,\n",
    "                      booster='gbtree',\n",
    "                      objective='binary:logistic',\n",
    "                      early_stopping_rounds=100,\n",
    "                      scale_pos_weight=float(len(y)-np.sum(y))/float(np.sum(y)),\n",
    "                      eval_metric='auc',\n",
    "                      gamma=1,\n",
    "                      reg_lambda=1,\n",
    "                      seed=2018,\n",
    "                      silent=False,\n",
    "                      n_jobs=24\n",
    "                             )"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
